A. 16S analysis

Network analysis on relative abundances

In the first analysis we utilize the raw counts. As such this network will be built using relative abundance data.

# Set seed
set.seed(777)

# Import data
otus <- as.matrix(read.csv2("16S_data/OTU_table_raw.csv", fill = TRUE, header = TRUE,
                    row.names = 1))

taxa <- tax_table(as.matrix(read.csv2("16S_data/tax_table_raw.csv", fill = TRUE, header = TRUE,
                    row.names = 1))
)

phy_df <- phyloseq(otu_table(otus, taxa_are_rows = FALSE), taxa)

# filterobj <- filterTaxonMatrix(otus, minocc = 20,
#                               keepSum = TRUE, return.filtered.indices = TRUE)
# otus.f <- filterobj$mat
# taxa.f <- taxa[setdiff(1:nrow(taxa), filterobj$filtered.indices),]
# dummyTaxonomy <- colnames(tax_df); dummyTaxonomy[1] <- "Kingdom_dummy"
# taxa.f <- rbind(taxa.f, dummyTaxonomy)
# rownames(taxa.f)[nrow(taxa.f)] <- "0"
# rownames(otus.f)[nrow(otus.f)] <- "0"
# 
# # Next, we assemble a new phyloseq object with the filtered OTU and taxonomy tables.
# updatedotus <- otu_table(otus.f, taxa_are_rows = TRUE)
# updatedtaxa <- tax_table(taxa.f)
# phyloseqobj.f <- phyloseq(updatedotus, updatedtaxa)

# Prevalence filtering
phy_df_filtered <- filter_taxa(phy_df, function(x) sum(x > 30) > (0.25*length(x)), TRUE)

sp_easi <- spiec.easi(phy_df_filtered, method='mb', lambda.min.ratio=1e-2,
                           nlambda=20, icov.select.params=list(rep.num=50))
## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb <- adj2igraph(sp_easi$refit,  vertex.attr = list(name=taxa_names(phy_df_filtered)))
vsize <- Biobase::rowMax(clr(otu_table(phy_df_filtered), 1))+15
Lineage_rel <- tax_table(phy_df_filtered)[,"Lineage"]
Lineage_rel <- factor(Lineage_rel, levels = unique(Lineage_rel))
vweights <- summary(symBeta(getOptBeta(sp_easi), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_filtered)=="Otu00001"]  <- "Limnohabitans MAG"
MAGs[taxa_names(phy_df_filtered)=="Otu00002"]  <- "Bacteroidetes MAG1"
MAGs[taxa_names(phy_df_filtered)=="Otu00003"]  <- "Bacteroidetes MAG2"
MAGs[is.na(MAGs)] <-""

# png(file = "./Figures/Figures_network/NETWORK-REL-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb, phy_df_filtered, type='taxa',
             line_weight = 2, hjust = 0.5,
             point_size = 0.1, alpha = 0.01, label_size = 3.95)+
  scale_fill_brewer(palette = "Paired")+
  scale_color_brewer(palette = "Paired")+
  geom_point(aes(size = vsize, fill = Lineage_rel), alpha = 0.5,
             colour="black", shape=21)+
  guides(size = FALSE,
    fill  = guide_legend(title = "Lineage", override.aes = list(size = 5),
                         nrow = 4),
    color = FALSE)+
  theme(legend.position="bottom", legend.text=element_text(size=12),
        text = element_text(size = 12),
        plot.margin = unit(c(1,1,1,1), "cm"))+
  scale_size(range = c(5, 15))+
  geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = 'black',
                   size = 4,
                       # Width of the line segments.
                   segment.size = 1.5,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.015, 'npc')),
                   nudge_x = -0.1,
                   nudge_y = 0.6
  )

# dev.off()

Network analysis on absolute abundances

The second network will be built using absolute abundance data by multiplying the relative taxon abundances by the total cell density. The final obtained counts will expressed as nr. of cells measured in 50 µL samples. We also only consider the OTUs that were left after the prevalence filtering conducted in the network construction with relative abundance data.

# Import cell count data
cell_counts <- read.csv("16S_data/cell_counts.csv")
cell_counts$sample_title <- as.factor(cell_counts$sample_title)
# Calculate proportions
phy_df_rel <- transform_sample_counts(phy_df, function(x) x/sum(x))

# Select samples for which corresponding counts are available
cell_counts <- cell_counts[cell_counts$sample_title %in% sample_names(phy_df_rel), ]
cell_counts <- droplevels(cell_counts)
phy_df_rel <- prune_samples(sample_names(phy_df_rel) %in% cell_counts$sample_title, phy_df_rel)

# Multiply with cell counts in 50 µL of sample
otu_table(phy_df_rel) <- otu_table(phy_df_rel) * cell_counts$Number_of_cells

# Select taxa that were selected based on prevalence in previous chunk
phy_df_abs <- prune_taxa(taxa_names(phy_df_filtered), phy_df_rel)

# Round absolute abundances to integers
otu_table(phy_df_abs) <- round(otu_table(phy_df_abs), 0)

# Construct network
sp_easi_abs <- spiec.easi(phy_df_abs, method='mb', lambda.min.ratio=1e-2,
                           nlambda=20, icov.select.params=list(rep.num=50))
## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
ig.mb_abs <- adj2igraph(sp_easi_abs$refit,  vertex.attr = list(name=taxa_names(phy_df_abs)))
vsize_abs <- Biobase::rowMax(clr(otu_table(phy_df_abs), 1))+15
Lineage_abs <- tax_table(phy_df_abs)[,"Lineage"]
Lineage_abs <- factor(Lineage_abs, levels = unique(Lineage_abs))
vweights_abs <- summary(symBeta(getOptBeta(sp_easi_abs), mode='maxabs'))
MAGs <- c(); MAGs[taxa_names(phy_df_abs)=="Otu00001"]  <- "Limnohabitans MAG"
MAGs[taxa_names(phy_df_abs)=="Otu00002"]  <- "Bacteroidetes MAG1"
MAGs[taxa_names(phy_df_abs)=="Otu00003"]  <- "Bacteroidetes MAG2"
MAGs[is.na(MAGs)] <-""

# Plot network inferred from absolute abundances
# png(file = "./Figures/Figures_network/NETWORK-ABS-CX-C30-A25.png", width = 9, height = 9, res = 500, units = "in")
plot_network_custom(ig.mb_abs, phy_df_abs, type='taxa',
             line_weight = 2, hjust = 0.5,
             point_size = 0.1, alpha = 0.01, label_size = 3.95)+
  scale_fill_brewer(palette = "Paired")+
  scale_color_brewer(palette = "Paired")+
  geom_point(aes(size = vsize_abs, fill = Lineage_abs), alpha = 0.5,
             colour="black", shape=21)+
  guides(size = FALSE,
    fill  = guide_legend(title = "Lineage", override.aes = list(size = 5),
                         nrow = 4),
    color = FALSE)+
  theme(legend.position="bottom", legend.text=element_text(size=12),
        text = element_text(size = 12),
        plot.margin = unit(c(1,1,1,1), "cm"))+
  scale_size(range = c(5, 15))+
  geom_label_repel(aes(label = MAGs), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = 'black',
                   size = 4,
                       # Width of the line segments.
                   segment.size = 1.5,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.015, 'npc')),
                   nudge_x = -0.1,
                   nudge_y = 0.6
  )

# dev.off()

B. MetaG analysis

# Read data
mean_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/mean_coverage.txt", header = TRUE)
std_coverage <- read.table("./SAMPLES-SUMMARY/bins_across_samples/std_coverage.txt", header = TRUE)
bin_size <- read.table("./SAMPLES-SUMMARY/general_bins_summary.txt", header = TRUE)[, c(1, 3, 6, 9)]
total_reads <- read.table("./sample_reads.tsv", header = TRUE)
read_length <- 300

# From wide to long format
mean_coverage_long <- gather(mean_coverage, Sample_ID, coverage, 
                             SAMPLE_16:SAMPLE_65, factor_key=TRUE)

std_coverage_long <- gather(std_coverage, Sample_ID, std_coverage, 
                            SAMPLE_16:SAMPLE_65, 
                            factor_key=TRUE)

coverage_data <- data.frame(mean_coverage_long, 
                            std_coverage = std_coverage_long[,3])

# Read and add metadata
# meta <- read.csv2("metadata.csv")
# meta$Sample_ID <- gsub(meta$Sample_ID, pattern = ".", replacement = "_", fixed = TRUE)
data_total <- left_join(coverage_data, total_reads, by = "Sample_ID")
data_total <- left_join(data_total, bin_size, by = "bins")
# data_total <- left_join(data_total, meta, by =  "Sample_ID")
data_total$bins <- plyr::revalue(data_total$bins, c("BetIa_bin"="Limnohabitans_bin", "bacIa_vizbin1"="Bacteroidetes_bin1",
                                              "bacIa_vizbin2"="Bacteroidetes_bin2"))
# Calculate relative abundance of the bins
data_total$mean_rel_abundance <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$upper_rel_abundance <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$lower_rel_abundance <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)

data_total$mean_rel_abundance_map <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$upper_rel_abundance_map <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)
data_total$lower_rel_abundance_map <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Mapped_reads)

# Add additional column that assigns PNCs to correct MAG
data_total$Genome_id <- factor(rep(c("Limnohabitans", "Limnohabitans", "Limnohabitans", "Bacteroidetes MAG2", "Bacteroidetes MAG1", "Bacteroidetes MAG2"), 4))

# Plot genome size for all three genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
                                "Bacteroidetes_bin2","Limnohabitans_bin"), ] %>% 
  ggplot(aes(x = bins, y = bin_size/(4*1e6), fill = Genome_id))+
  theme_bw()+
  geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
  coord_flip()+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme(axis.text.x = element_text(angle = 0, size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.x = element_text(size = 18),
        legend.title = element_text("Genome bin"), legend.position = "top")+
  xlab("")+
  ylab("Genome size (Mbp)")+
  guides(fill = FALSE)+
  geom_hline(yintercept = 1, size = 2, linetype="dotted")

# Plot irep for all 3 genomes
data_total[data_total$bins %in% c("Bacteroidetes_bin1",
                                "Bacteroidetes_bin2","Limnohabitans_bin"), ] %>% 
  ggplot(aes(x = bins, y = mean_irep/4, fill = Genome_id))+
  theme_bw()+
  geom_bar(width = 0.5, stat="identity", alpha = 0.7)+
  coord_flip()+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme(axis.text.x = element_text(angle = 0, size = 16),
        axis.text.y = element_text(size = 16),
        axis.title.x = element_text(size = 18),
        legend.title = element_text(""), legend.position = "top")+
  xlab("")+
  ylab("Index of Replication (iRep)")+
  guides(fill = FALSE)+
  geom_hline(yintercept = 1.34, size = 2, linetype="dotted")

1. Phylogenetic tree

Limnohabitans

RAxML tree for Limnohabitans MAG

RAxML tree for Limnohabitans MAG

Bacteroidetes

2. Investigate MAG- and 16S-based abundances

It is clear that there is significant %GC coverage bias present. The estimated relative abundances from metagenomics do not quantitatively match with the V3-V4 16S rRNA gene amplicon data. \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads })\] Another option is to calculate relative to mapped number of reads: \[Relative\ abundance =100*(\frac{mean\ coverage * bin\ size}{read\ length*total\ sample\ reads * \%mapped\ reads})\]

Import reference relative abundances from 16S data set in order to directly compare with metagenomic data set.

df_16S <- read.table("relative_abundance_16S.tsv", header = TRUE)
df_16S_long <- gather(df_16S, Sample_ID, relative_abundance_16S, 
                             SAMPLE_16:SAMPLE_65, factor_key=TRUE)
# Subset for only the three complete genomes (not PNCs).
data_total_sb <- data_total[data_total$bins %in% c("Limnohabitans_bin", "Bacteroidetes_bin1", "Bacteroidetes_bin2"),]

p_meta <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  geom_errorbar(aes(ymin=lower_rel_abundance, 
                    ymax=upper_rel_abundance, 
                    width=.1))+
  facet_grid(.~Sample_ID)+
  # ylim(0,1)+ 
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("Metagenomic - total reads")

# Corrected for mapped N° of reads
p_meta_mapped <- ggplot(data = data_total_sb, aes(x = bins, y = mean_rel_abundance_map, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  geom_errorbar(aes(ymin=lower_rel_abundance_map, 
                    ymax=upper_rel_abundance_map, 
                    width=.1))+
  facet_grid(.~Sample_ID)+
  # ylim(0,1)+ 
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("Metagenomic - mapped reads")

p_16S <- ggplot(data = df_16S_long, aes(x = bins, y = relative_abundance_16S, fill = bins))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  facet_grid(.~Sample_ID)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Mean relative abundance (%)")+
  ylim(-1,100)+
  ggtitle("V3-V4 16S")

grid_arrange_shared_legend(p_meta, p_meta_mapped, p_16S, ncol = 1, nrow = 3)

3. Investigate sequence characteristics within coding DNA sequences (CDS)

# First we need the files that map the gene ID to the sequence ID (linux cmd: https://github.com/rprops/MetaG_lakeMI/wiki/11.-Genome-annotation)
# These are stored in the IMG_annotation data for each genome bin

# Next, extract the %GC of each gene from the gff file
extract_gc_from_gff("./IMG_annotation/IMG_2724679690_Limnohabitans/121950.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/121951.assembled.gff", outputFolder = "GC_analysis")
extract_gc_from_gff("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/121960.assembled.gff", outputFolder = "GC_analysis")

# Use these files to make dataframes mapping function (COGs/Pfams/KO) and %GC
LIMNO_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:14 2017  --- There are 2830 genes with > 0.1 %
## Thu Oct 19 17:20:14 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:14 2017  --- The 10 genes with the highest GC% are:
##      function_id                                             function_name
## 2821     COG0405                              Gamma-glutamyltranspeptidase
## 2822     COG2755                  Lysophospholipase L1 or related esterase
## 2823     COG0642                      Signal transduction histidine kinase
## 2824     COG1514                                          2'-5' RNA ligase
## 2825     COG1767                Triphosphoribosyl-dephospho-CoA synthetase
## 2826     COG1261         Flagella basal body P-ring formation protein FlgA
## 2827     COG0665                Glycine/D-amino acid oxidase (deaminating)
## 2828     COG0810 Periplasmic protein TonB, links inner and outer membranes
## 2829     COG1040                 Predicted amidophosphoribosyltransferases
## 2830     COG1240                                 Mg-chelatase subunit ChlD
##        GC
## 2821 78.5
## 2822 78.5
## 2823 78.5
## 2824 78.5
## 2825 78.6
## 2826 78.6
## 2827 79.1
## 2828 79.5
## 2829 79.7
## 2830 80.6
BAC1_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:14 2017  --- There are 1889 genes with > 0.1 %
## Thu Oct 19 17:20:14 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:14 2017  --- The 10 genes with the highest GC% are:
##      function_id
## 1880     COG0052
## 1881     COG0183
## 1882     COG0629
## 1883     COG0509
## 1884     COG1734
## 1885     COG0636
## 1886     COG3502
## 1887     COG1765
## 1888     COG0377
## 1889     COG4104
##                                                                               function_name
## 1880                                                                   Ribosomal protein S2
## 1881                                                           Acetyl-CoA acetyltransferase
## 1882                                                    Single-stranded DNA-binding protein
## 1883                                    Glycine cleavage system H protein (lipoate-binding)
## 1884                                       RNA polymerase-binding transcription factor DksA
## 1885 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1886                                       Uncharacterized conserved protein, DUF952 family
## 1887                                                   Uncharacterized OsmC-related protein
## 1888 NADH:ubiquinone oxidoreductase 20 kD subunit (chhain B) or related Fe-S oxidoreductase
## 1889                 Zn-binding Pro-Ala-Ala-Arg (PAAR) domain, incolved in TypeVI secretion
##        GC
## 1880 51.6
## 1881 51.6
## 1882 52.0
## 1883 52.6
## 1884 52.7
## 1885 52.8
## 1886 52.9
## 1887 53.6
## 1888 53.6
## 1889 59.3
BAC2_gc_cog <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.cog.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:14 2017  --- There are 1797 genes with > 0.1 %
## Thu Oct 19 17:20:14 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:14 2017  --- The 10 genes with the highest GC% are:
##      function_id
## 1788     COG4675
## 1789     COG0636
## 1790     COG1501
## 1791     COG0725
## 1792     COG5434
## 1793     COG2115
## 1794     COG4225
## 1795     COG3669
## 1796     COG4588
## 1797     COG3258
##                                                                               function_name
## 1788                                      Microcystin-dependent protein  (function unknown)
## 1789 FoF1-type ATP synthase, membrane subunit c/Archaeal/vacuolar-type H+-ATPase, subunit K
## 1790                                      Alpha-glucosidase, glycosyl hydrolase family GH31
## 1791                             ABC-type molybdate transport system, periplasmic component
## 1792                                                                      Polygalacturonase
## 1793                                                                       Xylose isomerase
## 1794                                                      Rhamnogalacturonyl hydrolase YesR
## 1795                                                                     Alpha-L-fucosidase
## 1796               Accessory colonization factor AcfC, contains ABC-type periplasmic domain
## 1797                                                                           Cytochrome c
##        GC
## 1788 44.5
## 1789 44.6
## 1790 44.9
## 1791 45.0
## 1792 45.1
## 1793 45.1
## 1794 45.4
## 1795 45.5
## 1796 46.6
## 1797 46.8
LIMNO_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 4954 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##      function_id function_name   GC
## 4945   pfam13202     EF-hand_5 79.0
## 4946   pfam16537         T2SSB 79.0
## 4947   pfam01266           DAO 79.1
## 4948   pfam03544        TonB_C 79.5
## 4949   pfam00156  Pribosyltran 79.7
## 4950   pfam11142       DUF2917 79.8
## 4951   pfam13318       DUF4089 79.8
## 4952   pfam13519         VWA_2 80.6
## 4953   pfam02120      Flg_hook 80.9
## 4954   pfam03023          MVIN 81.7
BAC1_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 3929 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##      function_id function_name   GC
## 3920   pfam02803    Thiolase_C 51.6
## 3921   pfam00436           SSB 52.0
## 3922   pfam00171        Aldedh 52.2
## 3923   pfam01597         GCV_H 52.6
## 3924   pfam01258  zf-dskA_traR 52.7
## 3925   pfam00137    ATP-synt_C 52.8
## 3926   pfam06108        DUF952 52.9
## 3927   pfam02566          OsmC 53.6
## 3928   pfam01058   Oxidored_q6 53.6
## 3929   pfam05488    PAAR_motif 59.3
BAC2_gc_pfam <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.pfam.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 3573 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##      function_id   function_name   GC
## 3564   pfam13531      SBP_bac_11 46.6
## 3565   pfam13442 Cytochrome_CBB3 46.8
## 3566   pfam12779          YXWGXW 50.0
## 3567   pfam12779          YXWGXW 50.0
## 3568   pfam01391        Collagen 50.4
## 3569   pfam01391        Collagen 50.4
## 3570   pfam01391        Collagen 50.4
## 3571   pfam01391        Collagen 50.4
## 3572   pfam01391        Collagen 50.4
## 3573   pfam01391        Collagen 50.4
LIMNO_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121950.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 2164 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##                                                                                         function_id
## 2155                  two-component system, OmpR family, sensor histidine kinase QseC [EC:2.7.13.3]
## 2156                                                                  2'-5' RNA ligase [EC:6.5.1.-]
## 2157                                         triphosphoribosyl-dephospho-CoA synthase [EC:2.4.2.52]
## 2158                                              flagella basal body P-ring formation protein FlgA
## 2159                                                            general secretion pathway protein B
## 2160 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2161 tRNA 5-methylaminomethyl-2-thiouridine biosynthesis bifunctional protein [EC:2.1.1.61 1.5.-.-]
## 2162                                                4'-phosphopantetheinyl transferase [EC:2.7.8.-]
## 2163                                                     magnesium chelatase subunit D [EC:6.6.1.1]
## 2164                                                     flagellar hook-length control protein FliK
##      function_name   GC
## 2155   EC:2.7.13.3 78.5
## 2156    EC:6.5.1.- 78.5
## 2157   EC:2.4.2.52 78.6
## 2158               78.6
## 2159               79.0
## 2160      EC:1.5.- 79.1
## 2161   EC:2.1.1.61 79.1
## 2162    EC:2.7.8.- 79.1
## 2163    EC:6.6.1.1 80.6
## 2164               80.9
BAC1_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121951.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1//Annotation/2724679691.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 1384 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##                                             function_id function_name   GC
## 1375                  single-strand DNA-binding protein               51.3
## 1376                    threonine aldolase [EC:4.1.2.5]    EC:4.1.2.5 51.3
## 1377                large subunit ribosomal protein L32               51.3
## 1378                            uncharacterized protein               51.5
## 1379                 translation initiation factor IF-2               51.5
## 1380                 small subunit ribosomal protein S2               51.6
## 1381                  single-strand DNA-binding protein               52.0
## 1382                  glycine cleavage system H protein               52.6
## 1383            F-type H+-transporting ATPase subunit c               52.8
## 1384 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3]    EC:1.6.5.3 53.6
BAC2_gc_KO <- gc2function(seq_id_gc = "GC_analysis/seqid_GC_121960.assembled.gff.tsv", gene_id_seq_id ="./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698_gene_oid_2_seq_id.txt", 
                             functions = "./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.ko.tab.txt", gc_thresh = 0.1, output = FALSE)
## Thu Oct 19 17:20:15 2017  --- There are 1342 genes with > 0.1 %
## Thu Oct 19 17:20:15 2017  --- This is 100 % of all genes
## Thu Oct 19 17:20:15 2017  --- The 10 genes with the highest GC% are:
##                                             function_id function_name   GC
## 1333                            uncharacterized protein               43.9
## 1334 NADH-quinone oxidoreductase subunit B [EC:1.6.5.3]    EC:1.6.5.3 44.3
## 1335            F-type H+-transporting ATPase subunit c               44.6
## 1336      alpha-D-xyloside xylohydrolase [EC:3.2.1.177]  EC:3.2.1.177 44.9
## 1337                      xylose isomerase [EC:5.3.1.5]    EC:5.3.1.5 45.1
## 1338                   alpha-L-fucosidase [EC:3.2.1.51]   EC:3.2.1.51 45.4
## 1339                            uncharacterized protein               45.5
## 1340                            uncharacterized protein               45.8
## 1341                 accessory colonization factor AcfC               46.6
## 1342             thiosulfate dehydrogenase [EC:1.8.2.2]    EC:1.8.2.2 46.8

Motivation: For COGs there exists a hierarchy allowing us to investigate whether there is a conservation of high/low %GC in certain functional gene groups. In order to do this we need to incorporate this hierarchy into the genome dataframes we have now.

4. Analysis of gene length distribution

Here we use the dataframe made in the previous section to see if there is a significant difference in the gene length of the COGs within these three consensus genomes.

Observation: They have very small genes: on average < 500bp.

# Limnohabitans MAG gene length distribution 
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Limnohabitans MAG")+ xlim(0,2000)

# Bacteroidetes MAG1 gene length distribution 
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15, 
                  groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG1")+ xlim(0,2000)

# Bacteroidetes MAG2 gene length distribution 
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_cog, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,15)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG2")+ xlim(0,2000)

grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)

We can do the same for the Pfams.

# Limnohabitans MAG gene length distribution 
p_LIMNO_length <- easyGgplot2::ggplot2.histogram(data = LIMNO_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_limno,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Limnohabitans MAG")+ xlim(0,3000)

# Bacteroidetes MAG1 gene length distribution 
p_BAC1_length <- easyGgplot2::ggplot2.histogram(data = BAC1_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15, 
                  groupColors = col_bac1,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG1")+ xlim(0,3000)

# Bacteroidetes MAG2 gene length distribution 
p_BAC2_length <- easyGgplot2::ggplot2.histogram(data = BAC2_gc_pfam, xName = 'gene_length',
                  groupName = 'Genome', alpha = 0.5,
                  legendPosition = "top", binwidth = 0.15,
                  groupColors = col_bac2,addMeanLine=TRUE, meanLineColor="black",
                  meanLineType="dashed")+ theme_bw()+ ylim(0,25)+
  labs(x = "Gene length (bp)", y = "Count")+ theme(legend.position="none")+
  ggtitle("Bacteroidetes MAG2")+ xlim(0,3000)

grid.arrange(p_LIMNO_length, p_BAC1_length, p_BAC2_length, ncol = 3)

5. Identify unique functional genes (COG/Pfams)

# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC1)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1")
## There are 1178 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_LIMNO_BAC2)), "unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2")
## There are 1143 unique COGs in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_cog, BAC2_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC1_BAC2)), "unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2")
## There are 153 unique COGs in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_cog, BAC1_gc_cog, by = "cog_id")
cat("There are", paste(nrow(unique_BAC2_BAC1)), "unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1")
## There are 143 unique COGs in Bacteroidetes MAG2 vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG1
unique_pfam_LIMNO_BAC1 <- dplyr::anti_join(LIMNO_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC1)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1")
## There are 1474 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG1
# Find unique functions in Limnohabitans MAG vs Bacteroidetes MAG2
unique_pfam_LIMNO_BAC2 <- dplyr::anti_join(LIMNO_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_LIMNO_BAC2)), "unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2")
## There are 1424 unique Pfams in Limnohabitans MAG vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC1_BAC2 <- dplyr::anti_join(BAC1_gc_pfam, BAC2_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC1_BAC2)), "unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2")
## There are 285 unique Pfams in Bacteroidetes MAG1 vs Bacteroidetes MAG2
# Find unique functions in Bacteroidetes MAG1 vs Bacteroidetes MAG2
unique_pfam_BAC2_BAC1 <- dplyr::anti_join(BAC2_gc_pfam, BAC1_gc_pfam, by = "pfam_id")
cat("There are", paste(nrow(unique_pfam_BAC2_BAC1)), "unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1")
## There are 233 unique Pfams in Bacteroidetes MAG2 vs Bacteroidetes MAG1

6. COG functional categories

Get COG ID to COG functional category mapping file here: ftp://ftp.ncbi.nih.gov/pub/wolf/COGs/COG0303/cogs.csv

The exact statistical analysis to compare genomes based on these profiles should be performed in STAMP.

# Import COG mapping file
cogid_2_cogcat <- read.csv("./Mapping_files/cogid_2_cogcat.csv", sep = ",", header = FALSE, fill = TRUE,col.names = c("COG_ID", "COG_class", "function"))[, 1:2]
cogid_2_cogcat <- cogid_2_cogcat[(cogid_2_cogcat$COG_class)!="", ]
cogid_2_cogcat <- droplevels(cogid_2_cogcat)

# Read COG category file
cog_categories <- read.table("./Mapping_files/cog_categories.tsv", header = TRUE, sep = "\t")

# Merge COG metadata
cog_meta <- dplyr::left_join(cog_categories, cogid_2_cogcat, by = c("COG_class" = "COG_class"))
cog_meta <- droplevels(cog_meta)

# Merge genome information of all genome bins
merged_gc_cog <- rbind(LIMNO_gc_cog, BAC1_gc_cog, BAC2_gc_cog)
merged_gc_cog <- data.frame(merged_gc_cog, genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_cog)), 
                                             rep("Bacteroidetes MAG1", nrow(BAC1_gc_cog)),
                                             rep("Bacteroidetes MAG2", nrow(BAC2_gc_cog)))
                            )

# Merge this metadata with the genome data from before
# COGs with multiple classifications are currently still NA - work on this.
merged_gc_cog <- dplyr::left_join(merged_gc_cog, cog_meta, by = c("cog_id" = "COG_ID"))
merged_gc_cog <- merged_gc_cog[!is.na(merged_gc_cog$COG_functional_category),]

# Visualize distribution across major metabolism functional COG groups per genome.
p_cog_func_group <- ggplot(data = merged_gc_cog, 
                           aes(x=COG_functional_category, fill = COG_functional_cluster))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  facet_grid(genome_id~.)+
  scale_fill_brewer(palette = "Accent")+
  labs(x = "Gene length (bp)", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 90, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))

print(p_cog_func_group)

p_cog_func_clust <- ggplot(data = merged_gc_cog, 
                           aes(x=COG_functional_cluster, fill = COG_functional_cluster))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  facet_grid(genome_id~.)+
  scale_fill_brewer(palette = "Accent")+
  labs(x = "Gene length (bp)", y = "Count")+ 
  theme(legend.position="bottom",axis.text.x = element_text(angle = 90, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=2,byrow=TRUE))

print(p_cog_func_clust)

7. KO pathways

# Import data
ko_path_df <- read.table("./Mapping_files/ko00001.keg", header = FALSE, sep = ";",
                      skip = 8, quote = "", fill = TRUE, 
                      col.names = c("Level", "KO", "Function_abbrev", "Function_spec"))
ko_path_df <- ko_path_df[1:(nrow(ko_path_df)-1), ] # remove tailing "!" at the end of file

# Remove empty rows
ko_path_df <- ko_path_df[!ko_path_df$KO == "", ]

# Extract pathways from dataframe and add them as new factor column
# pathw_hier <- data.frame(Level = ko_path_df$Level[ko_path_df$Level %in% c("A","B","C")], 
#                     Name = ko_path_df$KO[ko_path_df$Level %in% c("A","B","C")]
#                     )
# pathw_hier$Name <- as.character(pathw_hier$Name)
ko_path_df <- data.frame(ko_path_df, level_A = "A", level_B = "B", level_C = "C",
                         stringsAsFactors = FALSE)
ko_path_df$KO <- as.character(ko_path_df$KO)

# Get positions where to replicate higher hierarichcal level
pos_A <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "A"], nrow(ko_path_df)+1)
pos_B <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "B"], nrow(ko_path_df)+1)
pos_C <- c(c(1:nrow(ko_path_df))[ko_path_df$Level %in% "C"], nrow(ko_path_df)+1)


for(i in 1:(length(pos_A)-1)){
  ko_path_df$level_A[pos_A[i]:(pos_A[i+1]-1)] <- ko_path_df$KO[pos_A[i]]
}

for(i in 1:(length(pos_B)-1)){
  ko_path_df$level_B[pos_B[i]:(pos_B[i+1]-1)] <- ko_path_df$KO[pos_B[i]]
}

for(i in 1:(length(pos_C)-1)){
  ko_path_df$level_C[pos_C[i]:(pos_C[i+1]-1)] <- ko_path_df$KO[pos_C[i]]
}

# Remove all rows with level A, B, C - and level column
ko_path_df <- ko_path_df[!ko_path_df$Level %in% c("A", "B", "C"), ]
ko_path_df$level_A <- gsub(ko_path_df$level_A, pattern = "<b>|</b>", replacement = "")
ko_path_df$level_B <- gsub(ko_path_df$level_B, pattern = "<b>|</b>", replacement = "")
ko_path_df <- ko_path_df[, -1]

# Annotate merged ko file
merged_gc_ko <- rbind(LIMNO_gc_KO, BAC1_gc_KO, BAC2_gc_KO)
merged_gc_ko <- data.frame(merged_gc_ko, 
                           genome_id = c(rep("Limnohabitans MAG", nrow(LIMNO_gc_KO)),
                                         rep("Bacteroidetes MAG1", nrow(BAC1_gc_KO)),
                                         rep("Bacteroidetes MAG2", nrow(BAC2_gc_KO)))
                            )
merged_gc_ko$ko_id <- gsub(merged_gc_ko$ko_id, pattern = "KO:", replacement = "")
merged_gc_ko <- dplyr::left_join(merged_gc_ko, ko_path_df[, c(1, 4:6)], by = c("ko_id" = "KO"))

# Fill up NA slots with "Unknown" pathway
merged_gc_ko$level_A[is.na(merged_gc_ko$level_A)] <- "Unknown"
merged_gc_ko$level_B[is.na(merged_gc_ko$level_B)] <- "Unknown"
merged_gc_ko$level_C[is.na(merged_gc_ko$level_C)] <- "Unknown"

8. Synonymous Codon Usage Bias analysis using CodonO

  • Get CodonO for linux here: http://sysbio.cvm.msstate.edu/software/CodonO/download

  • Run CU.linux input.genes.fna
  • Output from CodonO:
    inputfile.ok —- the SCUO units for each input sequence based on its order
    inputfile.fik —- the composition ratio of the i-th amino acid in the k-th sequence
    inputfile.hijk —- the frequency of the j-th degenerate codon for amino acid i in each sequence

  • This output was not sufficient to perform any analysis. Therefore I used the web browser to calculate the synonymous codon bias in the genes of the three genomes.

** Codon bias is proportional to mRNA production! (Wan et al 2004)

** However, strong correlation between SCUO and %GC has been reported, thereby confounding possible “biological” effects.. Beware..

# Import and format data for Limnohabitans MAG
SCUO_LIMNO <- read.table("./IMG_annotation/IMG_2724679690_Limnohabitans/Annotation/2724679690.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE
                         )
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = "\t", replacement = "")
Gene_LIMNO <- do.call(rbind, strsplit(SCUO_LIMNO$V1[grep("Ga0*", SCUO_LIMNO$V1)], " "))[,2]
SCUO_LIMNO$V1 <- gsub(SCUO_LIMNO$V1, pattern = " ", replacement = "")
SCUO_LIMNO <- data.frame(GC = SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_LIMNO$V1[grep(x = SCUO_LIMNO$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_LIMNO, each = 4)
)
SCUO_LIMNO$GC <- as.numeric(gsub(SCUO_LIMNO$GC, pattern = ".*=", replacement = ""))
SCUO_LIMNO$SCUO <- as.numeric(gsub(SCUO_LIMNO$SCUO, pattern = ".*=", replacement = ""))

# Import and format data for Bacteroidetes MAG1
SCUO_BAC1 <- read.table("./IMG_annotation/IMG_2724679691_Bacteroidetes_bin1/Annotation/2724679691.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = "\t", replacement = "")
Gene_BAC1 <- do.call(rbind, strsplit(SCUO_BAC1$V1[grep("Ga0*", SCUO_BAC1$V1)], " "))[,2]
SCUO_BAC1$V1 <- gsub(SCUO_BAC1$V1, pattern = " ", replacement = "")
SCUO_BAC1 <- data.frame(GC = SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_BAC1$V1[grep(x = SCUO_BAC1$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_BAC1, each = 4)
)
SCUO_BAC1$GC <- as.numeric(gsub(SCUO_BAC1$GC, pattern = ".*=", replacement = ""))
SCUO_BAC1$SCUO <- as.numeric(gsub(SCUO_BAC1$SCUO, pattern = ".*=", replacement = ""))


# Import and format data for Bacteroidetes MAG2
SCUO_BAC2 <- read.table("./IMG_annotation/IMG_2724679698_Bacteroidetes_bin2/Annotation/2724679698.genes.fna.codonO.output",
                         sep = ",", blank.lines.skip = TRUE, allowEscapes = FALSE, skipNul = TRUE)
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = "\t", replacement = "")
Gene_BAC2 <- do.call(rbind, strsplit(SCUO_BAC2$V1[grep("Ga0*", SCUO_BAC2$V1)], " "))[,2]
SCUO_BAC2$V1 <- gsub(SCUO_BAC2$V1, pattern = " ", replacement = "")
SCUO_BAC2 <- data.frame(GC = SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "GC*.*=")], 
                         SCUO = rep(SCUO_BAC2$V1[grep(x = SCUO_BAC2$V1, pattern = "SCUO*")], each = 4),
                         Gene = rep(Gene_BAC2, each = 4)
)
SCUO_BAC2$GC <- as.numeric(gsub(SCUO_BAC2$GC, pattern = ".*=", replacement = ""))
SCUO_BAC2$SCUO <- as.numeric(gsub(SCUO_BAC2$SCUO, pattern = ".*=", replacement = ""))

# Merge data to one dataframe
SCUO_merged_gen <- data.frame(rbind(SCUO_LIMNO, SCUO_BAC1, SCUO_BAC2),
           Genome_ID = c(rep("Limnohabitans MAG", nrow(SCUO_LIMNO)), rep("Bacteroidetes MAG1", nrow(SCUO_BAC1)), 
           rep("Bacteroidetes MAG2", nrow(SCUO_BAC2))),
            GCx = rep(c("GC_mean", "GC1", "GC2", "GC3"), 
                     (nrow(SCUO_LIMNO) + nrow(SCUO_BAC1) + nrow(SCUO_BAC2))/4
            )
)

# Merge codon bias data with KO pathway annotation
SCUO_merged <- dplyr::left_join(SCUO_merged_gen, merged_gc_ko[, c(1:2,4 ,14:21)], by = c("Gene" = "contig_geneID"))
          
# Visualize differences in codon bias
p_SCUO.1 <- ggplot(data = SCUO_merged, aes (x = 100*GC, y = SCUO, fill = Genome_ID))+
  geom_point(size = 4, shape = 21, alpha = 0.7)+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  facet_wrap(~GCx, ncol = 2)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 0, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("SCUO")+
  xlab("%GC")+
  ylim(0,1)

print(p_SCUO.1)

# Subset to genes for which ko annotation is available
SCUO_merged_sb <- SCUO_merged[!is.na(SCUO_merged$level_A), ]
SCUO_merged_sb <- SCUO_merged_sb[SCUO_merged_sb$GCx == "GC_mean", ]
# Look at pathways enriched in high %GC
# SCUO_merged_sb[]


SCUO_merged_gen_gcmean <- SCUO_merged_gen %>% dplyr::filter(GCx == "GC_mean")

p_SCUO.2 <- ggplot(data = SCUO_merged_gen_gcmean, aes (x = Genome_ID, y = SCUO))+
  geom_jitter(size = 3, alpha = 0.3, shape = 21, aes(fill = Genome_ID))+
  geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
  # scale_fill_brewer(palette = "Accent")+
  scale_fill_manual(values = c(col_bac1, col_bac2, col_limno))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Codon bias - SCUO")+
  xlab("")+
  ylim(0,1)+ 
  guides(fill=FALSE)+ 
  scale_x_discrete(labels=c("Bacteroidetes MAG1" = paste("Bacteroidetes MAG1 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[1],")", sep = ""),
                            "Bacteroidetes MAG2" = paste("Bacteroidetes MAG2 (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[2],")", sep = ""),
                            "Limnohabitans MAG" = paste("Limnohabitans MAG (n=",table(SCUO_merged_gen_gcmean$Genome_ID)[3],")", sep = ""))
  )
# 
print(p_SCUO.2)

tmp <- SCUO_merged_sb$genome_id
tmp2 <- cbind(SCUO_merged_sb$ko_id, 
      c(rep(col_limno, table(tmp)[3]), rep(col_bac1, table(tmp)[1]), rep(col_bac2, table(tmp)[2]))
)

write.table(tmp2, file = "All_KO.tsv", quote = FALSE,
            col.names = FALSE, row.names = FALSE)

9. PosiGene analysis for identifying genes under positive selection in the Limnohabitans MAG

Phylogenetic tree used in PosiGene analysis. Red branch indicates the genome for which PSG were tested. Phylogenetic tree used in PosiGene analysis

# Import results_short file
data_posi <- read.table("./posigene_analysis/result_tables/Limnohabitans_MAG_results.tsv", header = TRUE, fill = TRUE, sep = "\t")[, c("Transcript", "P.Value","FDR", "HA.foreground.omega")]
colnames(data_posi)[1] <- "Gene"
data_posi$Gene <- as.character(data_posi$Gene)

# Import mapping file to link gene IDs from PosiGene to 
# those used by the IMG annotation (both are in headers from .genes.fna)
map_posi <- read.table("./Mapping_files/posiG_mapping.tsv")
colnames(map_posi) <- c("posi_geneID", "IMG_geneID")
map_posi$posi_geneID <- as.character(map_posi$posi_geneID)

# Filter out the genes that are under positive selection
# Taking threshold of adjusted p.value of 0.05 and FDR < 0.05
# Also remove dN/dS ratios of less than 15.
data_posi <- data_posi %>% filter(P.Value < 0.05 & 
                                    FDR < 0.05 & HA.foreground.omega < 15)
# Report summary
cat(paste("There are ", nrow(data_posi), " genes under positive selection in this MAG (P<0.05). This is ",round(100*nrow(data_posi)/nrow(map_posi),1), "% of all genes",  sep = "")
)
## There are 215 genes under positive selection in this MAG (P<0.05). This is 5.6% of all genes
# Merge this data with the functional annotation (e.g. KO) of these genes
data_posi <- left_join(data_posi, map_posi, by = c("Gene" = "posi_geneID"))
data_posi_KO <- left_join(data_posi, merged_gc_ko, by = c("IMG_geneID" = "contig_geneID"))

# Optional: write tabel for quick view in iPath v2
# write.table(file = "KO_posiG.tsv", unique(data_posi_KO$ko_id), quote = FALSE,
            # row.names = FALSE, col.names = FALSE)
# cat("Genes under positive selection with KO annotation")
# data_posi_KO[, c(1:6, 9:12, 20:29)]


# Focus on C-metabolism
data_posi_KO[data_posi_KO$ko_id %in% data_posi_KO$ko_id[data_posi_KO$level_B == "Carbohydrate metabolism"], ] %>% 
 ggplot(aes(x = ko_id, fill = level_C))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  # facet_wrap(~level_B, ncol = 2)+
  scale_fill_brewer(palette = "Paired")+
  labs(x = "", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 60, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=6,byrow=TRUE))

# Focus on energy metabolism
data_posi_KO %>% dplyr::filter(level_B == "Energy metabolism") %>% 
 ggplot(aes(x = ko_id, fill = level_C))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  # facet_wrap(~level_B, ncol = 2)+
  scale_fill_brewer(palette = "Paired")+
  labs(x = "", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 60, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=6,byrow=TRUE))

# Focus on translation
data_posi_KO %>% dplyr::filter(level_B == "Translation") %>% 
 ggplot(aes(x = ko_id, fill = level_C))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  # facet_wrap(~level_B, ncol = 2)+
  scale_fill_brewer(palette = "Paired")+
  labs(x = "", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 60, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=6,byrow=TRUE))

# Focus on secondary metabolite production
data_posi_KO %>% dplyr::filter(level_B == "Metabolism of cofactors and vitamins") %>% 
 ggplot(aes(x = ko_id, fill = level_C))+
  geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
  theme_bw()+
  # facet_wrap(~level_B, ncol = 2)+
  scale_fill_brewer(palette = "Paired")+
  labs(x = "", y = "Count")+ 
  theme(legend.position="bottom", axis.text.x = element_text(angle = 60, hjust = 1),
                                                   legend.text = element_text(size = 5))+
  guides(fill=guide_legend(nrow=6,byrow=TRUE))

# Count number of unique genes involved in biosynthesis
# Correlate genes under positive selection with their GC content
data_SCUO_posi <- SCUO_merged_gen_gcmean %>% filter(Genome_ID == "Limnohabitans MAG")
data_SCUO_posi <- droplevels(data_SCUO_posi)
data_SCUO_posi$posi_select <- data_SCUO_posi$Gene %in% data_posi$IMG_geneID
data_SCUO_posi$posi_select <- factor(data_SCUO_posi$posi_select, levels = c("FALSE", "TRUE"))
# merge with dN/dS ratio data
data_SCUO_posi <- left_join(data_SCUO_posi,
                            data_posi, by = c("Gene" = "IMG_geneID"))
data_SCUO_posi_only <- data_SCUO_posi %>% filter(posi_select == "TRUE")

# Selected annotation level (KO)
selected_KOlevels <- c("Signal transduction",
"Amino acid metabolism",
"Metabolism of other amino acids",
"Metabolism of cofactors and vitamins",
"Carbohydrate metabolism",
"Energy metabolism",
"Translation",
"Membrane transport",
"Biosynthesis of other secondary metabolites",
"Lipid metabolism",
"Membrane transport",
"Unknown"
)
# Evaluate correlation between codon bias and selection pressure
p_SCUO_posi <- ggplot(data = data_SCUO_posi, aes (x = GC, y = SCUO))+
  geom_point(shape = 21, size = 3, fill = adjustcolor(col_limno, 0.1))+
  geom_point(shape = 21, size = 4, fill = "red",
             data = subset(data_SCUO_posi, posi_select == TRUE))+
  # geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
  # scale_fill_brewer(palette = "Accent")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1),  "darkorange"))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Codon bias - SCUO")+ 
  xlab("%GC")+
  ylim(0,1)+ 
  guides(fill=FALSE, size = FALSE)

p_SCUO_posi

p_SCUO_posi2 <- ggplot(data = data_SCUO_posi, aes (x = posi_select, y = SCUO,
                                                   fill = posi_select))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1.5, color = "darkgrey", width = 0.35,
               outlier.shape = NA)+
  # scale_fill_brewer(palette = "Accent")+
  scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("Codon bias - SCUO")+ 
  xlab("")+
  ylim(0,1)+ 
  guides(size = FALSE)

p_SCUO_posi2

p_SCUO_posi3 <- ggplot(data = data_SCUO_posi, aes (x = posi_select, y = GC,
                                                   fill = posi_select))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1.5, color = "darkgrey", width = 0.35,
               outlier.shape = NA)+
  # scale_fill_brewer(palette = "Accent")+
  scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  ylab("%GC")+ 
  xlab("")+
  ylim(0.5,0.9)+ 
  guides(size = FALSE)

p_SCUO_posi3

p_SCUO_posi4 <- ggplot(data = data_SCUO_posi_only, aes (x = SCUO, y = sqrt(HA.foreground.omega)))+
  geom_point(shape = 21, size = 3, fill = adjustcolor(col_limno, 0.1))+
  # geom_point(shape = 21, size = 4, fill = "red",
             # data = subset(data_SCUO_posi, posi_select == TRUE))+
  # geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
  # scale_fill_brewer(palette = "Accent")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1),  "darkorange"))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  xlab("Codon bias - SCUO")+ 
  ylab(expression(sqrt(dN/dS)))+
  # ylim(0,1)+ 
  geom_hline(yintercept = sqrt(1), color = "darkgreen", size = 2, linetype = 2)+
  guides(fill=FALSE, size = FALSE)

p_SCUO_posi4

p_SCUO_posi5 <- ggplot(data = data_SCUO_posi_only, aes (x = 100*GC, y = sqrt(HA.foreground.omega)))+
  geom_point(shape = 21, size = 3, fill = adjustcolor(col_limno, 0.1))+
  # geom_point(shape = 21, size = 4, fill = "red",
             # data = subset(data_SCUO_posi, posi_select == TRUE))+
  # geom_boxplot(alpha=0, size =1.5, color = "darkorange")+
  # scale_fill_brewer(palette = "Accent")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1),  "darkorange"))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.x=element_text(size=18))+
  xlab("%GC")+ 
  ylab(expression(sqrt(dN/dS)))+
  # ylim(0,1)+ 
  geom_hline(yintercept = sqrt(1), color = "darkgreen", size = 2, linetype = 2)+
  guides(fill=FALSE, size = FALSE)

p_SCUO_posi5

p_SCUO_posi6 <- ggplot(data = data_posi_KO, aes (x = level_A, y = HA.foreground.omega,
                                                   fill = level_A))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Accent")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  # ylim(0.5,0.9)+ 
  guides(size = FALSE, fill = FALSE)

p_SCUO_posi6

# Extract level_C labels of genes in top 5 dN/ds ratio
# First filter out the genes with multiple annotations by
# selecting the annotation with the highest identity.
# data_posi_KO_unique <- data_posi_KO %>% group_by(Gene) %>%
  # filter(percent_identity == max(percent_identity))
  
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$level_C
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)
# Remove everything before first space
selected_KOlevels_labels <- sub(".*? (.+)", "\\1", selected_KOlevels_labels)

p_SCUO_posi7 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  # ylim(0.5,0.9)+ 
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )+
  ylim(0,25)

p_SCUO_posi7

# Extract level_C labels of genes in top 5 dN/ds ratio
top <- 10
thresh <- sort(data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega, decreasing = TRUE)[top+1]
selected_KOlevels_labels <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$ko_name
selected_KOlevels_labels[data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ]$HA.foreground.omega < thresh] <- ""

# Remove everything between brackets ([*])
selected_KOlevels_labels <- gsub(pattern = " *\\[.*?\\] *", 
                                 replacement = "", selected_KOlevels_labels)

p_SCUO_posi8 <- data_posi_KO[data_posi_KO$level_B %in% selected_KOlevels, ] %>% 
  ggplot(aes(x = level_B, y = HA.foreground.omega,
                                                   fill = level_B))+
  # geom_jitter(shape = 21, aes(fill = posi_select, size = posi_select), width = 0.2)+
  geom_jitter(shape = 21, size = 3, width = 0.2, alpha = 0.5)+
  geom_boxplot(alpha=0.5, size =1, color = "black", width = 0.35,
               outlier.shape = NA)+
  scale_fill_brewer(palette = "Set3")+
  # scale_fill_manual(values = c(adjustcolor(col_limno, 0.1), adjustcolor("red", 0.5)))+
  # scale_size_manual(values = c(2.5,4))+
  theme_bw()+
  # facet_wrap(Genome_ID~GCx)+
  theme(axis.text=element_text(size=12.5), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 60, hjust = 1),
        strip.text.x=element_text(size=18),
        legend.position = "bottom")+
  xlab("")+ 
  ylab(expression(dN/dS))+
  # ylim(0.5,0.9)+ 
  guides(size = FALSE, fill = FALSE)+
  geom_label_repel(aes(label = selected_KOlevels_labels), fontface = 'bold', color = 'black',
                   box.padding = 0.35, point.padding = 0.5,
                   segment.color = "#3F4656",
                   size = 3,
                       # Width of the line segments.
                   segment.size = 1,
                   # Draw an arrow from the label to the data point.
                   arrow = arrow(length = unit(0.02, 'npc'), type = "closed"),
                   nudge_x = -0.1,
                   nudge_y = 0.6,
                   force = 10
                   
  )+
  ylim(0,25)

p_SCUO_posi8

10. Plasmid reconstruction using Recycler

files <- list.files("./recycler_plasmid/", pattern="*_length_cov")
files <- paste0("./recycler_plasmid/", files)
df_plasm_16 <- read.table(files[1], header = FALSE)
df_plasm_17 <- read.table(files[2], header = FALSE)
df_plasm_64 <- read.table(files[3], header = FALSE)
df_plasm_65 <- read.table(files[4], header = FALSE)
df_plasm <- rbind(df_plasm_16, df_plasm_17, df_plasm_64, df_plasm_65)

df_plasm <- as.character(df_plasm$V1); tmp <- df_plasm
df_plasm <- do.call(rbind, strsplit(df_plasm, "_"))
df_plasm <- df_plasm[, -c(1,3,5,7,8)]
colnames(df_plasm) <- c("NODE_ID", "Length", "avg_coverage")
df_plasm <- apply(df_plasm, 2, function(x) as.integer(x))
df_plasm <- data.frame(df_plasm, node_IDs = tmp)

# Filter out plasmid scaffolds lower than 1000 bp
p_plasm1 <- df_plasm %>% filter(Length > 1000) %>% 
  ggplot(aes(x = Length/1000, y = sqrt(avg_coverage), fill = Length))+
  geom_point(shape = 21, size = 4)+ theme_bw()+
  # scale_x_log10()+
  # scale_y_log10()+
  scale_fill_continuous()+
  ylab(expression(sqrt(Average_coverage)))+
  xlab("Length (kbp)")+
  ggtitle("Coverage-Length distribution of reconstructed plasmid fragments")

p_plasm1